home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Atari Mega Archive 1
/
Atari Mega Archive - Volume 1.iso
/
mint
/
utils
/
agrepsrc.zoo
/
sgrep.c
< prev
next >
Wrap
C/C++ Source or Header
|
1993-01-19
|
19KB
|
686 lines
/* Copyright (c) 1991 Sun Wu and Udi Manber. All Rights Reserved. */
#include <stdio.h>
#define MAXSYM 256
#define MAXMEMBER 8192
#define CHARTYPE unsigned char
#define MaxError 20
#define MAXPATT 256
#define MAXLINE 1024
#define MaxCan 2048
#define BLOCKSIZE 8192
#define MAX_SHIFT_2 4096
#define ON 1
#define LOG_ASCII 8
#define LOG_DNA 3
#define MAXMEMBER_1 65536
#define LONG_EXAC 20
#define LONG_APPX 24
#define W_DELIM 2
extern COUNT, FNAME, SILENT, FILENAMEONLY, num_of_matched;
extern DNA ; /* DNA flag is set in checksg when pattern is DNA pattern and
p_size > 16 */
extern WORDBOUND, WHOLELINE, NOUPPER;
extern unsigned char CurrentFileName[], Progname[];
extern unsigned Mask[];
extern unsigned endposition;
unsigned char BSize; /* log_c m */
unsigned char char_map[MAXSYM];
/* data area */
int shift_1;
CHARTYPE SHIFT[MAXSYM];
CHARTYPE MEMBER[MAXMEMBER];
CHARTYPE pat[MAXPATT];
unsigned Hashmask;
char MEMBER_1[MAXMEMBER_1];
CHARTYPE TR[MAXSYM];
char_tr(pat, m)
unsigned char *pat;
int *m;
{
int i;
unsigned char temp[MAXPATT];
for(i=0; i<MAXSYM; i++) TR[i] = i;
if(NOUPPER) {
for(i='A'; i<= 'Z'; i++) TR[i] = i + 'a' - 'A';
}
if(WORDBOUND) { /* SUN: To be added to be more complete */
TR['\n'] = TR['\t'] = TR[' '] = TR[','] = TR[';'] = TR[':'] =
TR['!'] = TR['"'] = TR['?'] = TR['<'] = TR['>'] =
TR['='] = TR['['] = TR[']'] = TR['\''] =
TR['('] = TR[')'] = TR['$'] = TR['/'] = TR['\\'] = W_DELIM;
}
if(WHOLELINE) {
memcpy(temp, pat, *m);
pat[0] = '\n';
memcpy(pat+1, temp, *m);
pat[*m+1] = '\n';
pat[*m+2] = 0;
*m = *m + 2;
}
}
sgrep(pat, m, fd, D)
CHARTYPE *pat; int fd, m, D;
{
CHARTYPE text[BLOCKSIZE+2*MAXLINE+MAXPATT]; /* input text stream */
int offset = 2*MAXLINE;
int buf_end, num_read, i, start, end, residue = 0;
if(pat[0] == '^' || pat[0] == '$') pat[0] = '\n';
if(pat[m-1] == '^' || pat[m-1] == '$') pat[m-1] = '\n';
char_tr(pat, &m); /* will change pat, and m if WHOLELINE is ON */
text[offset-1] = '\n'; /* initial case */
for(i=0; i < MAXLINE; i++) text[i] = 0; /* security zone */
start = offset;
if(WHOLELINE) start--;
if(m >= MAXPATT) {
fprintf(stderr, "%s: pattern too long\n", Progname);
exit(2);
}
if(D == 0) {
if(m > LONG_EXAC) m_preprocess(pat);
else prep_bm(pat, m);
}
else if (DNA) prep4(pat, m);
else if(m >= LONG_APPX) am_preprocess(pat);
else {
prep(pat, m, D);
initmask(pat, Mask, m, 0, &endposition);
}
for(i=1; i<=m; i++) text[BLOCKSIZE+offset+i] = pat[m-1];
/* to make sure the skip loop in bm() won't go out of bound */
while( (num_read = read(fd, text+offset, BLOCKSIZE)) > 0)
{
buf_end = end = offset + num_read -1 ;
if(num_read < BLOCKSIZE) if(text[end] != '\n') text[++end] = '\n';
while(text[end] != '\n' && end > offset) end--;
residue = buf_end - end + 1 ;
text[start-1] = '\n';
if(D==0) {
if(m > LONG_EXAC) monkey(pat, m, text+start, text+end);
else bm(pat, m, text+start, text+end);
}
else {
if(DNA) monkey4( pat, m, text+start, text+end, D );
else {
if(m >= LONG_APPX) a_monkey(pat, m, text+start, text+end, D);
else agrep(pat, m, text+start, text+end, D);
}
}
if(FILENAMEONLY && num_of_matched) {
printf("%s\n", CurrentFileName);
return; }
start = offset - residue ;
if(start < MAXLINE) {
start = MAXLINE;
}
strncpy(text+start, text+end, residue);
start++;
} /* end of while(num_read = ... */
return;
} /* end sgrep */
/* SUN: bm assumes that the content of text[n]...text[n+m-1] is
pat[m-1] such that the skip loop is guaranteed to terminated */
bm(pat, m, text, textend)
CHARTYPE *text, *textend, *pat; int m;
{
register int shift;
register int m1, j, d1;
/*
printf("%d\t", textend - text);
printf("%c, %c", *text, *textend);
*/
d1 = shift_1; /* at least 1 */
m1 = m - 1;
shift = 0;
while (text <= textend) {
shift = SHIFT[*(text += shift)];
while(shift) {
shift = SHIFT[*(text += shift)];
shift = SHIFT[*(text += shift)];
shift = SHIFT[*(text += shift)];
}
j = 0;
while(TR[pat[m1 - j]] == TR[*(text - j)]) {
if(++j == m) break; /* if statement can be
saved, but for safty ... */
}
if (j == m ) {
if(text > textend) return;
if(WORDBOUND) {
if(TR[*(text+1)] != W_DELIM) goto CONT;
if(TR[*(text-m)] != W_DELIM) goto CONT;
}
num_of_matched++;
if(FILENAMEONLY) return;
if(!(COUNT)) {
if(FNAME) printf("%s: ", CurrentFileName);
while(*(--text) != '\n');
while(*(++text) != '\n') putchar(*(text));
putchar(*text);
}
else { while(*text != '\n') text++; }
CONT:
shift = 1;
}
else shift = d1;
}
return;
}
/* initmask() initializes the mask table for the pattern */
/* endposition is a mask for the endposition of the pattern */
/* endposition will contain k mask bits if the pattern contains k fragments */
initmask(pattern, Mask, m, D, endposition)
CHARTYPE *pattern; unsigned *Mask; register int m, D; unsigned *endposition;
{
register unsigned Bit1, c;
register int i, j, frag_num;
Bit1 = 1 << 31; /* the first bit of Bit1 is 1, others 0. */
frag_num = D+1; *endposition = 0;
for (i = 0; i < frag_num; i++) *endposition = *endposition | (Bit1 >> i);
*endposition = *endposition >> (m - frag_num);
for(i = 0; i < m; i++)
if (pattern[i] == '^' || pattern[i] == '$') {
pattern[i] = '\n';
}
for(i = 0; i < MAXSYM; i++) Mask[i] = ~0;
for(i = 0; i < m; i++) /* initialize the mask table */
{ c = pattern[i];
for ( j = 0; j < m; j++)
if( c == pattern[j] )
Mask[c] = Mask[c] & ~( Bit1 >> j ) ;
}
}
prep(Pattern, M, D) /* preprocessing for partitioning_bm */
CHARTYPE *Pattern; /* can be fine-tuned to choose a better partition */
register int M, D;
{
register int i, j, k, p, shift;
register unsigned m;
unsigned hash, b_size = 3;
m = M/(D+1);
p = M - m*(D+1);
for (i = 0; i < MAXSYM; i++) SHIFT[i] = m;
for (i = M-1; i>=p ; i--) {
shift = (M-1-i)%m;
hash = Pattern[i];
if(SHIFT[hash] > shift) SHIFT[hash] = shift;
}
#ifdef DEBUG
for(i=0; i<M; i++) printf(" %d,", SHIFT[Pattern[i]]);
printf("\n");
#endif
shift_1 = m;
for(i=0; i<D+1; i++) {
j = M-1 - m*i;
for(k=1; k<m; k++) {
for(p=0; p<D+1; p++)
if(Pattern[j-k] == Pattern[M-1-m*p])
if(k < shift_1) shift_1 = k;
}
}
#ifdef DEBUG
printf("\nshift_1 = %d", shift_1);
#endif
if(shift_1 == 0) shift_1 = 1;
for(i=0; i<MAXMEMBER; i++) MEMBER[i] = 0;
if (m < 3) b_size = m;
for(i=0; i<D+1; i++) {
j = M-1 - m*i;
hash = 0;
for(k=0; k<b_size; k++) {
hash = (hash << 2) + Pattern[j-k];
}
#ifdef DEBUG
printf(" hash = %d,", hash);
#endif
MEMBER[hash] = 1;
}
}
agrep( pat, M, text, textend, D )
int M, D ; register CHARTYPE *text, *textend, *pat;
{
register int i;
register int m = M/(D+1);
register CHARTYPE *textstart;
register int shift, HASH;
int j=0, k, m1, d1;
int n, cdx;
int Candidate[MaxCan][2], round, lastend=0;
unsigned R1[MaxError+1], R2[MaxError+1];
register unsigned int r1, endpos, c;
unsigned currentpos;
unsigned Bit1;
unsigned r_newline;
Candidate[0][0] = Candidate[0][1] = 0;
d1 = shift_1;
cdx = 0;
if(m < 3) r1 = m;
else r1 = 3;
textstart = text;
shift = m-1;
while (text < textend) {
shift = SHIFT[*(text += shift)];
while(shift) {
shift = SHIFT[*(text += shift)];
shift = SHIFT[*(text += shift)];
}
j = 1; HASH = *text;
while(j < r1) { HASH = (HASH << 2) + *(text-j);
j++; }
if (MEMBER[HASH]) {
i = text - textstart;
if((i - M - D - 10) > Candidate